001 /* 002 * SmithWaterman.java 003 * 004 * Copyright 2003 Sergio Anibal de Carvalho Junior 005 * 006 * This file is part of NeoBio. 007 * 008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of 009 * the GNU General Public License as published by the Free Software Foundation; either 010 * version 2 of the License, or (at your option) any later version. 011 * 012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 014 * PURPOSE. See the GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License along with NeoBio; 017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 018 * Boston, MA 02111-1307, USA. 019 * 020 * Proper attribution of the author as the source of the software would be appreciated. 021 * 022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net 023 * Department of Computer Science http://www.dcs.kcl.ac.uk 024 * King's College London, UK http://www.kcl.ac.uk 025 * 026 * Please visit http://neobio.sourceforge.net 027 * 028 * This project was supervised by Professor Maxime Crochemore. 029 * 030 */ 031 032 package neobio.alignment; 033 034 import java.io.Reader; 035 import java.io.IOException; 036 037 /** 038 * This class implement the classic local alignment algorithm (with linear gap penalty 039 * function) due to T.F.Smith and M.S.Waterman (1981). 040 * 041 * <P>This algorithm is very similar to the {@linkplain NeedlemanWunsch} algorithm for 042 * global alignment. The idea here also consists of building an (n+1 x m+1) matrix M given 043 * two sequences A and B of sizes n and m, respectively. However, unlike in the global 044 * alignment case, every position M[i,j] in the matrix contains the similarity score of 045 * <B>suffixes</B> of A[1..i] and B[1..j].</P> 046 * 047 * <P>Starting from row 0, column 0, the {@link #computeMatrix computeMatrix} method 048 * computes each position M[i,j] with the following recurrence:</P> 049 * 050 * <CODE><BLOCKQUOTE><PRE> 051 * M[0,0] = <B>M[0,j]</B> = <B>M[i,0]</B> = 0 052 * M[i,j] = max { M[i,j-1] + scoreInsertion (B[j]), 053 * M[i-1,j-1] + scoreSubstitution (A[i], B[j]), 054 * M[i-1,j] + scoreDeletion(A[i]) } 055 * </PRE></BLOCKQUOTE></CODE> 056 * 057 * <P>Note that, here, all cells in the first row and column are set to zero. The best 058 * local alignment score is the highest value found anywhere in the matrix.</P> 059 * 060 * <P>Just like in global alignment case, this algorithm has quadratic space complexity 061 * because it needs to keep an (n+1 x m+1) matrix in memory. And since the work of 062 * computing each cell is constant, it also has quadratic time complexity.</P> 063 * 064 * <P>After the matrix has been computed, the alignment can be retrieved by tracing a path 065 * back in the matrix from the position of the highest score until a cell of value zero is 066 * reached. This step is performed by the {@link #buildOptimalAlignment 067 * buildOptimalAlignment} method, and its time complexity is linear on the size of the 068 * alignment. 069 * 070 * <P>If the similarity value only is needed (and not the alignment itself), it is easy to 071 * reduce the space requirement to O(n) by keeping just the last row or column in memory. 072 * This is precisely what is done by the {@link #computeScore computeScore} method. Note 073 * that it still requires O(n<SUP>2</SUP>) time.</P> 074 * 075 * <P>For a more efficient approach to the local alignment problem, see the 076 * {@linkplain CrochemoreLandauZivUkelson} algorithm. For global alignment, see the 077 * {@linkplain NeedlemanWunsch} algorithm.</P> 078 * 079 * @author Sergio A. de Carvalho Jr. 080 * @see NeedlemanWunsch 081 * @see CrochemoreLandauZivUkelson 082 * @see CrochemoreLandauZivUkelsonLocalAlignment 083 * @see CrochemoreLandauZivUkelsonGlobalAlignment 084 */ 085 public class SmithWaterman extends PairwiseAlignmentAlgorithm 086 { 087 /** 088 * The first sequence of an alignment. 089 */ 090 protected CharSequence seq1; 091 092 /** 093 * The second sequence of an alignment. 094 */ 095 protected CharSequence seq2; 096 097 /** 098 * The dynamic programming matrix. Each position (i, j) represents the best score 099 * between a suffic of the firsts i characters of <CODE>seq1</CODE> and a suffix of 100 * the first j characters of <CODE>seq2</CODE>. 101 */ 102 protected int[][] matrix; 103 104 /** 105 * Indicate the row of where an optimal local alignment can be found in the matrix.. 106 */ 107 protected int max_row; 108 109 /** 110 * Indicate the column of where an optimal local alignment can be found in the matrix. 111 */ 112 protected int max_col; 113 114 /** 115 * Loads sequences into {@linkplain CharSequence} instances. In case of any error, an 116 * exception is raised by the constructor of <CODE>CharSequence</CODE> (please check 117 * the specification of that class for specific requirements). 118 * 119 * @param input1 Input for first sequence 120 * @param input2 Input for second sequence 121 * @throws IOException If an I/O error occurs when reading the sequences 122 * @throws InvalidSequenceException If the sequences are not valid 123 * @see CharSequence 124 */ 125 protected void loadSequencesInternal (Reader input1, Reader input2) 126 throws IOException, InvalidSequenceException 127 { 128 // load sequences into instances of CharSequence 129 this.seq1 = new CharSequence(input1); 130 this.seq2 = new CharSequence(input2); 131 } 132 133 /** 134 * Frees pointers to loaded sequences and the dynamic programming matrix so that their 135 * data can be garbage collected. 136 */ 137 protected void unloadSequencesInternal () 138 { 139 this.seq1 = null; 140 this.seq2 = null; 141 this.matrix = null; 142 } 143 144 /** 145 * Builds an optimal local alignment between the loaded sequences after computing the 146 * dynamic programming matrix. It calls the <CODE>buildOptimalAlignment</CODE> method 147 * after the <CODE>computeMatrix</CODE> method computes the dynamic programming 148 * matrix. 149 * 150 * @return an optimal pairwise alignment between the loaded sequences 151 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible 152 * with the loaded sequences. 153 * @see #computeMatrix 154 * @see #buildOptimalAlignment 155 */ 156 protected PairwiseAlignment computePairwiseAlignment () 157 throws IncompatibleScoringSchemeException 158 { 159 // compute the matrix 160 computeMatrix (); 161 162 // build and return an optimal local alignment 163 PairwiseAlignment alignment = buildOptimalAlignment (); 164 165 // allow the matrix to be garbage collected 166 matrix = null; 167 168 return alignment; 169 } 170 171 /** 172 * Computes the dynamic programming matrix. 173 * 174 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible 175 * with the loaded sequences. 176 */ 177 protected void computeMatrix () throws IncompatibleScoringSchemeException 178 { 179 int r, c, rows, cols, ins, sub, del, max_score; 180 181 rows = seq1.length()+1; 182 cols = seq2.length()+1; 183 184 matrix = new int [rows][cols]; 185 186 // initiate first row 187 for (c = 0; c < cols; c++) 188 matrix[0][c] = 0; 189 190 // keep track of the maximum score 191 this.max_row = this.max_col = max_score = 0; 192 193 // calculates the similarity matrix (row-wise) 194 for (r = 1; r < rows; r++) 195 { 196 // initiate first column 197 matrix[r][0] = 0; 198 199 for (c = 1; c < cols; c++) 200 { 201 ins = matrix[r][c-1] + scoreInsertion(seq2.charAt(c)); 202 sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r),seq2.charAt(c)); 203 del = matrix[r-1][c] + scoreDeletion(seq1.charAt(r)); 204 205 // choose the greatest 206 matrix[r][c] = max (ins, sub, del, 0); 207 208 if (matrix[r][c] > max_score) 209 { 210 // keep track of the maximum score 211 max_score = matrix[r][c]; 212 this.max_row = r; this.max_col = c; 213 } 214 } 215 } 216 } 217 218 /** 219 * Builds an optimal local alignment between the loaded sequences. Before it is 220 * executed, the dynamic programming matrix must already have been computed by 221 * the <CODE>computeMatrix</CODE> method. 222 * 223 * @return an optimal local alignment between the loaded sequences 224 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible 225 * with the loaded sequences. 226 * @see #computeMatrix 227 */ 228 protected PairwiseAlignment buildOptimalAlignment () throws 229 IncompatibleScoringSchemeException 230 { 231 StringBuffer gapped_seq1, score_tag_line, gapped_seq2; 232 int r, c, max_score, sub; 233 234 // start at the cell with maximum score 235 r = this.max_row; 236 c = this.max_col; 237 238 max_score = matrix[r][c]; 239 240 gapped_seq1 = new StringBuffer(); 241 score_tag_line = new StringBuffer(); 242 gapped_seq2 = new StringBuffer(); 243 244 while ((r > 0 || c > 0) && (matrix[r][c] > 0)) 245 { 246 if (c > 0) 247 if (matrix[r][c] == matrix[r][c-1] + scoreInsertion(seq2.charAt(c))) 248 { 249 // insertion 250 gapped_seq1.insert (0, GAP_CHARACTER); 251 score_tag_line.insert (0, GAP_TAG); 252 gapped_seq2.insert (0, seq2.charAt(c)); 253 254 c = c - 1; 255 256 // skip to the next iteration 257 continue; 258 } 259 260 if ((r > 0) && (c > 0)) 261 { 262 sub = scoreSubstitution(seq1.charAt(r), seq2.charAt(c)); 263 264 if (matrix[r][c] == matrix[r-1][c-1] + sub) 265 { 266 // substitution 267 gapped_seq1.insert (0, seq1.charAt(r)); 268 if (seq1.charAt(r) == seq2.charAt(c)) 269 if (useMatchTag()) 270 score_tag_line.insert (0, MATCH_TAG); 271 else 272 score_tag_line.insert (0, seq1.charAt(r)); 273 else if (sub > 0) 274 score_tag_line.insert (0, APPROXIMATE_MATCH_TAG); 275 else 276 score_tag_line.insert (0, MISMATCH_TAG); 277 gapped_seq2.insert (0, seq2.charAt(c)); 278 279 r = r - 1; c = c - 1; 280 281 // skip to the next iteration 282 continue; 283 } 284 } 285 286 // must be a deletion 287 gapped_seq1.insert (0, seq1.charAt(r)); 288 score_tag_line.insert (0, GAP_TAG); 289 gapped_seq2.insert (0,GAP_CHARACTER); 290 291 r = r - 1; 292 } 293 294 return new PairwiseAlignment (gapped_seq1.toString(), score_tag_line.toString(), 295 gapped_seq2.toString(), max_score); 296 } 297 298 /** 299 * Computes the score of the best local alignment between the two sequences using the 300 * scoring scheme previously set. This method calculates the similarity value only 301 * (doesn't build the whole matrix so the alignment cannot be recovered, however it 302 * has the advantage of requiring O(n) space only). 303 * 304 * @return the score of the best local alignment between the loaded sequences 305 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible 306 * with the loaded sequences. 307 */ 308 protected int computeScore () throws IncompatibleScoringSchemeException 309 { 310 int[] array; 311 int rows = seq1.length()+1, cols = seq2.length()+1; 312 int r, c, tmp, ins, del, sub, max_score; 313 314 // keep track of the maximum score 315 max_score = 0; 316 317 if (rows <= cols) 318 { 319 // goes columnwise 320 array = new int [rows]; 321 322 // initiate first column 323 for (r = 0; r < rows; r++) 324 array[r] = 0; 325 326 // calculate the similarity matrix (keep current column only) 327 for (c = 1; c < cols; c++) 328 { 329 // set first position to zero (tmp hold values 330 // that will be later moved to the array) 331 tmp = 0; 332 333 for (r = 1; r < rows; r++) 334 { 335 ins = array[r] + scoreInsertion(seq2.charAt(c)); 336 sub = array[r-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c)); 337 del = tmp + scoreDeletion(seq1.charAt(r)); 338 339 // move the temp value to the array 340 array[r-1] = tmp; 341 342 // choose the greatest (or zero if all negative) 343 tmp = max (ins, sub, del, 0); 344 345 // keep track of the maximum score 346 if (tmp > max_score) max_score = tmp; 347 } 348 349 // move the temp value to the array 350 array[rows - 1] = tmp; 351 } 352 } 353 else 354 { 355 // goes rowwise 356 array = new int [cols]; 357 358 // initiate first row 359 for (c = 0; c < cols; c++) 360 array[c] = 0; 361 362 // calculate the similarity matrix (keep current row only) 363 for (r = 1; r < rows; r++) 364 { 365 // set first position to zero (tmp hold values 366 // that will be later moved to the array) 367 tmp = 0; 368 369 for (c = 1; c < cols; c++) 370 { 371 ins = tmp + scoreInsertion(seq2.charAt(c)); 372 sub = array[c-1] + scoreSubstitution(seq1.charAt(r), seq2.charAt(c)); 373 del = array[c] + scoreDeletion(seq1.charAt(r)); 374 375 // move the temp value to the array 376 array[c-1] = tmp; 377 378 // choose the greatest (or zero if all negative) 379 tmp = max (ins, sub, del, 0); 380 381 // keep track of the maximum score 382 if (tmp > max_score) max_score = tmp; 383 } 384 385 // move the temp value to the array 386 array[cols - 1] = tmp; 387 } 388 } 389 390 return max_score; 391 } 392 }